import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
# Load the raw stock-tweet dataset (columns: Date, Tweet, Stock Name, Company Name)
df = pd.read_csv("stock_tweets.csv")
df.head()
| Date | Tweet | Stock Name | Company Name | |
|---|---|---|---|---|
| 0 | 2022-09-29 23:41:16+00:00 | Mainstream media has done an amazing job at br... | TSLA | Tesla, Inc. |
| 1 | 2022-09-29 23:24:43+00:00 | Tesla delivery estimates are at around 364k fr... | TSLA | Tesla, Inc. |
| 2 | 2022-09-29 23:18:08+00:00 | 3/ Even if I include 63.0M unvested RSUs as of... | TSLA | Tesla, Inc. |
| 3 | 2022-09-29 22:40:07+00:00 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... | TSLA | Tesla, Inc. |
| 4 | 2022-09-29 22:27:05+00:00 | @RealDanODowd @Tesla Stop trying to kill kids,... | TSLA | Tesla, Inc. |
df.dtypes
Date object Tweet object Stock Name object Company Name object dtype: object
df.shape
(80793, 4)
# Filter the dataframe for rows where the company is 'Tesla'
# .copy() detaches the result from the original frame so the in-place
# edits below operate on an independent DataFrame, not a view
df = df[df['Company Name'] == 'Tesla, Inc.'].copy()
df
| Date | Tweet | Stock Name | Company Name | |
|---|---|---|---|---|
| 0 | 2022-09-29 23:41:16+00:00 | Mainstream media has done an amazing job at br... | TSLA | Tesla, Inc. |
| 1 | 2022-09-29 23:24:43+00:00 | Tesla delivery estimates are at around 364k fr... | TSLA | Tesla, Inc. |
| 2 | 2022-09-29 23:18:08+00:00 | 3/ Even if I include 63.0M unvested RSUs as of... | TSLA | Tesla, Inc. |
| 3 | 2022-09-29 22:40:07+00:00 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... | TSLA | Tesla, Inc. |
| 4 | 2022-09-29 22:27:05+00:00 | @RealDanODowd @Tesla Stop trying to kill kids,... | TSLA | Tesla, Inc. |
| ... | ... | ... | ... | ... |
| 37417 | 2021-09-30 02:52:38+00:00 | Playing in the dirt and #chasingsunsets\n@tesl... | TSLA | Tesla, Inc. |
| 37418 | 2021-09-30 02:40:26+00:00 | I agree with @freshjiva that $TSLA ‘s EV busin... | TSLA | Tesla, Inc. |
| 37419 | 2021-09-30 01:59:02+00:00 | Hold. On. Tight. $TSLA | TSLA | Tesla, Inc. |
| 37420 | 2021-09-30 01:38:26+00:00 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... | TSLA | Tesla, Inc. |
| 37421 | 2021-09-30 01:16:13+00:00 | In other words, AMD has been giving Tesla pref... | TSLA | Tesla, Inc. |
37422 rows × 4 columns
df.shape
(37422, 4)
# Convert the 'Date' column (tweet timestamp strings) to timezone-aware datetimes
df['Date'] = pd.to_datetime(df['Date'])
df.dtypes
Date datetime64[ns, UTC] Tweet object Stock Name object Company Name object dtype: object
# Format the 'Date' column to include only the date part (yyyy-mm-dd)
# NOTE: strftime turns the column back into plain strings (dtype object)
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')
df.head()
| Date | Tweet | Stock Name | Company Name | |
|---|---|---|---|---|
| 0 | 2022-09-29 | Mainstream media has done an amazing job at br... | TSLA | Tesla, Inc. |
| 1 | 2022-09-29 | Tesla delivery estimates are at around 364k fr... | TSLA | Tesla, Inc. |
| 2 | 2022-09-29 | 3/ Even if I include 63.0M unvested RSUs as of... | TSLA | Tesla, Inc. |
| 3 | 2022-09-29 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... | TSLA | Tesla, Inc. |
| 4 | 2022-09-29 | @RealDanODowd @Tesla Stop trying to kill kids,... | TSLA | Tesla, Inc. |
df.shape
(37422, 4)
# Remove repeated rows
df = df.drop_duplicates()
df.shape
(37413, 4)
# Sanity check: no missing values in any column
df.isnull().sum()
Date 0 Tweet 0 Stock Name 0 Company Name 0 dtype: int64
# Ticker and company are constant ('TSLA' / 'Tesla, Inc.') after the filter,
# so both columns carry no information and can be dropped
df.drop(['Stock Name', 'Company Name'], axis=1, inplace=True)
df.head()
<ipython-input-13-cba4765805bd>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.drop(['Stock Name', 'Company Name'], axis=1, inplace=True)
| Date | Tweet | |
|---|---|---|
| 0 | 2022-09-29 | Mainstream media has done an amazing job at br... |
| 1 | 2022-09-29 | Tesla delivery estimates are at around 364k fr... |
| 2 | 2022-09-29 | 3/ Even if I include 63.0M unvested RSUs as of... |
| 3 | 2022-09-29 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... |
| 4 | 2022-09-29 | @RealDanODowd @Tesla Stop trying to kill kids,... |
import nltk
# One-off downloads of the NLTK resources used below:
# punkt (tokenizer), stopwords, wordnet (lemmatizer), and the POS-tagger model
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to /root/nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /root/nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date!
True
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from wordcloud import WordCloud
from collections import Counter
from nltk.tokenize import word_tokenize
# Function to plot frequent punctuation marks
def plot_punctuation_frequency(text, top_n=20):
    """Bar-plot the top_n most frequent punctuation marks in ``text``.

    Parameters
    ----------
    text : str
        Raw text to scan.
    top_n : int, optional
        Number of punctuation marks to show (default 20).
    """
    # Every character that is neither word-character nor whitespace
    punctuation = re.findall(r'[^\w\s]', text)
    # Count each mark and keep the top_n most frequent
    most_common_punctuation = Counter(punctuation).most_common(top_n)
    # Guard: zip(*[]) on an empty result would raise
    # "ValueError: not enough values to unpack" for punctuation-free text
    if not most_common_punctuation:
        return
    # Unpack the keys and values for plotting
    keys, values = zip(*most_common_punctuation)
    # Plot the frequency
    plt.figure(figsize=(18, 6))
    plt.bar(keys, values)
    plt.xlabel('Punctuation Marks')
    plt.ylabel('Frequency')
    plt.title(f'Top {top_n} Most Frequent Punctuation Marks')
    plt.show()
# Function to plot frequent stop words
def plot_stopwords_frequency(text, top_n=20):
    """Bar-plot the top_n most frequent English stop words in ``text``.

    Parameters
    ----------
    text : str
        Raw text to scan (tokenized with nltk's word_tokenize).
    top_n : int, optional
        Number of stop words to show (default 20).

    NOTE(review): tokens are not lower-cased here, so capitalized stop
    words ('The', 'And') are not counted — confirm this is intended.
    """
    # Tokenize the text
    tokens = word_tokenize(text)
    # Set membership keeps the per-token filter O(1)
    stop_words = set(stopwords.words('english'))
    stop_words_found = [word for word in tokens if word in stop_words]
    # Count each stop word and keep the top_n most frequent
    most_common_stopwords = Counter(stop_words_found).most_common(top_n)
    # Guard: zip(*[]) on an empty result would raise
    # "ValueError: not enough values to unpack" for stop-word-free text
    if not most_common_stopwords:
        return
    # Unpack the keys and values for plotting
    keys, values = zip(*most_common_stopwords)
    # Plot the frequency
    plt.figure(figsize=(18, 6))
    plt.bar(keys, values)
    plt.xlabel('Stop Words')
    plt.ylabel('Frequency')
    plt.title(f'Top {top_n} Most Frequent Stop Words')
    plt.show()
# Function to generate word cloud
def generate_word_cloud(text):
    """Render a word cloud of ``text`` with English stop words removed."""
    # Build the cloud image with English stop words filtered out
    english_stopwords = set(stopwords.words('english'))
    cloud = WordCloud(width=1200, height=600,
                      background_color='white',
                      stopwords=english_stopwords,
                      min_font_size=10).generate(text)
    # Display it without axes, flush to the figure edges
    plt.figure(figsize=(8, 10), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
# Visualize punctuation frequency across all raw tweets joined into one string
text = ' '.join(df['Tweet'])
plot_punctuation_frequency(text)
# Visualize stop words frequency
plot_stopwords_frequency(text)
# Generate word cloud
generate_word_cloud(text)
# Function to clean tweets
def clean_tweet(tweet):
    """Strip @mentions, URLs and redundant whitespace from a tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned tweet.
    """
    # Remove @mentions
    tweet = re.sub(r'@\w+', '', tweet)
    # Remove links; the dot in 'www.' is escaped so only literal 'www.'
    # prefixes match — the original r'www.\S+' let '.' match any character
    # and therefore also stripped ordinary words starting with 'www'
    tweet = re.sub(r'http\S+|www\.\S+', '', tweet)
    # Collapse whitespace runs and trim the ends
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet
# Apply the cleaning function to the 'Tweet' column; the result goes into a
# new column so the raw text is preserved for comparison
df['Cleaned Tweet'] = df['Tweet'].apply(clean_tweet)
# Display the modified dataframe
df[['Date', 'Tweet', 'Cleaned Tweet']]
<ipython-input-19-759f2852dc7c>:14: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['Cleaned Tweet'] = df['Tweet'].apply(clean_tweet)
| Date | Tweet | Cleaned Tweet | |
|---|---|---|---|
| 0 | 2022-09-29 | Mainstream media has done an amazing job at br... | Mainstream media has done an amazing job at br... |
| 1 | 2022-09-29 | Tesla delivery estimates are at around 364k fr... | Tesla delivery estimates are at around 364k fr... |
| 2 | 2022-09-29 | 3/ Even if I include 63.0M unvested RSUs as of... | 3/ Even if I include 63.0M unvested RSUs as of... |
| 3 | 2022-09-29 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... | Hahaha why are you still trying to stop Tesla ... |
| 4 | 2022-09-29 | @RealDanODowd @Tesla Stop trying to kill kids,... | Stop trying to kill kids, you sad deranged old... |
| ... | ... | ... | ... |
| 37417 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... | Playing in the dirt and |
| 37418 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... | I agree with that $TSLA ‘s EV business alone i... |
| 37419 | 2021-09-30 | Hold. On. Tight. $TSLA | Hold. On. Tight. $TSLA |
| 37420 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery ... |
| 37421 | 2021-09-30 | In other words, AMD has been giving Tesla pref... | In other words, AMD has been giving Tesla pref... |
37413 rows × 3 columns
from nltk import pos_tag
# Preprocessing function
# Built once at module level: constructing the lemmatizer and re-reading the
# stop-word corpus inside the function was repeated for every one of the
# ~37k tweets and dominated the preprocessing time.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))
# Penn Treebank tag prefix -> WordNet POS for lemmatization
# ('NN*' nouns, 'VB*' verbs, 'JJ*' adjectives; everything else unchanged)
_TAG_TO_POS = {'NN': 'n', 'VB': 'v', 'JJ': 'a'}

def preprocess_text(text):
    """Lower-case, strip punctuation/digits, drop stop words and lemmatize.

    Parameters
    ----------
    text : str
        A (pre-cleaned) tweet.

    Returns
    -------
    str
        Space-joined lemmatized tokens.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation, then digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in _STOP_WORDS]
    # POS-tag so nouns/verbs/adjectives are lemmatized with the right POS;
    # tokens with any other tag are kept as-is (same as the original logic)
    lemmatized_text = []
    for word, tag in pos_tag(filtered_tokens):
        pos = _TAG_TO_POS.get(tag[:2])
        lemmatized_text.append(_LEMMATIZER.lemmatize(word, pos=pos) if pos else word)
    # Join the lemmatized tokens back into a single string
    return ' '.join(lemmatized_text)
# Apply preprocessing to the 'Cleaned Tweet' column (overwrites it in place)
df['Cleaned Tweet'] = df['Cleaned Tweet'].apply(preprocess_text)
# Display the preprocessed DataFrame
df
<ipython-input-21-04db43ba880b>:36: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['Cleaned Tweet'] = df['Cleaned Tweet'].apply(preprocess_text)
| Date | Tweet | Cleaned Tweet | |
|---|---|---|---|
| 0 | 2022-09-29 | Mainstream media has done an amazing job at br... | mainstream medium do amazing job brainwash peo... |
| 1 | 2022-09-29 | Tesla delivery estimates are at around 364k fr... | tesla delivery estimate around k analyst tsla |
| 2 | 2022-09-29 | 3/ Even if I include 63.0M unvested RSUs as of... | even include unvested rsus additional equity n... |
| 3 | 2022-09-29 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... | hahaha still try stop tesla fsd bro get shit t... |
| 4 | 2022-09-29 | @RealDanODowd @Tesla Stop trying to kill kids,... | stop try kill kid sad derange old man |
| ... | ... | ... | ... |
| 37417 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... | play dirt |
| 37418 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... | agree tsla ev business alone worth gt sh wo fs... |
| 37419 | 2021-09-30 | Hold. On. Tight. $TSLA | hold tight tsla |
| 37420 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... | get ready tsla _ _ _ _ _ _ q delivery number u... |
| 37421 | 2021-09-30 | In other words, AMD has been giving Tesla pref... | word amd give tesla preferential treatment bc ... |
37413 rows × 3 columns
# Generate word cloud from the preprocessed (lemmatized, stop-word-free) tweets
text = ' '.join(df['Cleaned Tweet'])
generate_word_cloud(text)
# Function to clean tweets
# NOTE: redefines clean_tweet — from here on, "cleaning" means ticker removal.
def clean_tweet(tweet):
    """Drop the ticker words 'tsla'/'tesla' and normalize whitespace."""
    # Strip standalone occurrences of the ticker/company name, any casing
    without_ticker = re.sub(r'\b(tsla|tesla)\b', '', tweet, flags=re.IGNORECASE)
    # Collapse the whitespace runs left behind by the removal
    return re.sub(r'\s+', ' ', without_ticker).strip()
# Apply the ticker-removal cleaning to the already-preprocessed tweets
df['Cleaned Tweet'] = df['Cleaned Tweet'].apply(clean_tweet)
# Display the modified dataframe
df[['Date', 'Tweet', 'Cleaned Tweet']]
<ipython-input-23-d09bbf052268>:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['Cleaned Tweet'] = df['Cleaned Tweet'].apply(clean_tweet)
| Date | Tweet | Cleaned Tweet | |
|---|---|---|---|
| 0 | 2022-09-29 | Mainstream media has done an amazing job at br... | mainstream medium do amazing job brainwash peo... |
| 1 | 2022-09-29 | Tesla delivery estimates are at around 364k fr... | delivery estimate around k analyst |
| 2 | 2022-09-29 | 3/ Even if I include 63.0M unvested RSUs as of... | even include unvested rsus additional equity n... |
| 3 | 2022-09-29 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... | hahaha still try stop fsd bro get shit togethe... |
| 4 | 2022-09-29 | @RealDanODowd @Tesla Stop trying to kill kids,... | stop try kill kid sad derange old man |
| ... | ... | ... | ... |
| 37417 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... | play dirt |
| 37418 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... | agree ev business alone worth gt sh wo fsd lik... |
| 37419 | 2021-09-30 | Hold. On. Tight. $TSLA | hold tight |
| 37420 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... | get ready _ _ _ _ _ _ q delivery number ur answer |
| 37421 | 2021-09-30 | In other words, AMD has been giving Tesla pref... | word amd give preferential treatment bc know l... |
37413 rows × 3 columns
# Generate word cloud again, now with the ticker words removed
text = ' '.join(df['Cleaned Tweet'])
generate_word_cloud(text)
# Persist the fully preprocessed tweets
df.to_csv('preprocessed_tweets.csv', index=False)
# prompt: scrape TSLA stock prices from 2021-09-30 to 2024-05-19 from yfinance
import yfinance as yf
import pandas as pd
# Define the ticker symbol and date range
ticker_symbol = "TSLA"
start_date = "2021-09-30"
end_date = "2024-05-19"
# Use yfinance to download historical data
# (output above ends at 2024-05-17 — presumably end is exclusive and
# 05-18/05-19 fall on a weekend; confirm against yfinance docs)
historical_data = yf.download(ticker_symbol, start=start_date, end=end_date)
# Convert the data to a Pandas DataFrame
# NOTE(review): yf.download already returns a DataFrame; this wrap is a no-op
df = pd.DataFrame(historical_data)
df
# Save the DataFrame to a CSV file (the index holds the trading dates)
df.to_csv('historical_stock_prices.csv', index=True)
[*********************100%%**********************] 1 of 1 completed
df
| Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|
| Date | ||||||
| 2021-09-30 | 260.333344 | 263.043335 | 258.333344 | 258.493347 | 258.493347 | 53868000 |
| 2021-10-01 | 259.466675 | 260.260010 | 254.529999 | 258.406677 | 258.406677 | 51094200 |
| 2021-10-04 | 265.500000 | 268.989990 | 258.706665 | 260.510010 | 260.510010 | 91449900 |
| 2021-10-05 | 261.600006 | 265.769989 | 258.066681 | 260.196655 | 260.196655 | 55297800 |
| 2021-10-06 | 258.733337 | 262.220001 | 257.739990 | 260.916656 | 260.916656 | 43898400 |
| ... | ... | ... | ... | ... | ... | ... |
| 2024-05-13 | 170.000000 | 175.399994 | 169.000000 | 171.889999 | 171.889999 | 67018900 |
| 2024-05-14 | 174.500000 | 179.490005 | 174.070007 | 177.550003 | 177.550003 | 86407400 |
| 2024-05-15 | 179.899994 | 180.000000 | 173.110001 | 173.990005 | 173.990005 | 79663000 |
| 2024-05-16 | 174.100006 | 175.789993 | 171.429993 | 174.839996 | 174.839996 | 59812200 |
| 2024-05-17 | 173.550003 | 179.630005 | 172.750000 | 177.460007 | 177.460007 | 76627600 |
662 rows × 6 columns
df1.shape
(6300, 8)
# Keep only adjusted close and volume; raw OHLC columns are not needed for
# the downstream analysis.
# NOTE(review): assumes the download produced an 'Adj Close' column (i.e.
# auto_adjust was off when fetched) — verify with the installed yfinance.
df.drop(['Open', 'High', 'Low', 'Close'], axis=1, inplace=True)
df.head()
| Adj Close | Volume | |
|---|---|---|
| Date | ||
| 2021-09-30 | 258.493347 | 53868000 |
| 2021-10-01 | 258.406677 | 51094200 |
| 2021-10-04 | 260.510010 | 91449900 |
| 2021-10-05 | 260.196655 | 55297800 |
| 2021-10-06 | 260.916656 | 43898400 |
df.shape
(662, 2)
# @title Volume
from matplotlib import pyplot as plt
# Daily trading volume over the full date range
df['Volume'].plot(kind='line', figsize=(8, 4), title='Volume')
plt.gca().spines[['top', 'right']].set_visible(False)
from matplotlib import pyplot as plt
# Distribution of adjusted closing prices
df['Adj Close'].plot(kind='hist', bins=20, title='Adj Close')
plt.gca().spines[['top', 'right',]].set_visible(False)
# Adjusted closing price over time
df['Adj Close'].plot(kind='line', figsize=(8, 4), title='Adjusted Closing Price')
plt.gca().spines[['top', 'right']].set_visible(False)
# Sanity check: no missing values in either column
df.isnull().sum()
Adj Close 0 Volume 0 dtype: int64
# Persist the trimmed price frame (index=True keeps the Date index column)
df.to_csv('historical_prices.csv', index=True)